Board Game Behavior: An Analysis of Demographics and Gameplay Preferences¶

Documentation¶

Initial Project Proposal (11/8/23)

Milestone One (11/18/23)

Data Processing and Modeling¶

In [1]:
!python -m pip install pandas
!python -m pip install openpyxl
!python -m pip install seaborn
!python -m pip install scikit-learn
Collecting pandas
  Downloading pandas-2.0.3-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (18 kB)
Requirement already satisfied: python-dateutil>=2.8.2 in /opt/hostedtoolcache/Python/3.8.18/x64/lib/python3.8/site-packages (from pandas) (2.8.2)
Requirement already satisfied: pytz>=2020.1 in /opt/hostedtoolcache/Python/3.8.18/x64/lib/python3.8/site-packages (from pandas) (2023.3.post1)
Collecting tzdata>=2022.1 (from pandas)
  Downloading tzdata-2023.3-py2.py3-none-any.whl (341 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 341.8/341.8 kB 5.8 MB/s eta 0:00:00
Requirement already satisfied: numpy>=1.20.3 in /opt/hostedtoolcache/Python/3.8.18/x64/lib/python3.8/site-packages (from pandas) (1.24.4)
Requirement already satisfied: six>=1.5 in /opt/hostedtoolcache/Python/3.8.18/x64/lib/python3.8/site-packages (from python-dateutil>=2.8.2->pandas) (1.16.0)
Downloading pandas-2.0.3-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (12.4 MB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 12.4/12.4 MB 97.6 MB/s eta 0:00:00
Installing collected packages: tzdata, pandas
Successfully installed pandas-2.0.3 tzdata-2023.3

[notice] A new release of pip is available: 23.0.1 -> 23.3.1
[notice] To update, run: pip install --upgrade pip
Collecting openpyxl
  Downloading openpyxl-3.1.2-py2.py3-none-any.whl (249 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 250.0/250.0 kB 8.6 MB/s eta 0:00:00
Collecting et-xmlfile (from openpyxl)
  Downloading et_xmlfile-1.1.0-py3-none-any.whl (4.7 kB)
Installing collected packages: et-xmlfile, openpyxl
Successfully installed et-xmlfile-1.1.0 openpyxl-3.1.2

[notice] A new release of pip is available: 23.0.1 -> 23.3.1
[notice] To update, run: pip install --upgrade pip
Collecting seaborn
  Downloading seaborn-0.13.0-py3-none-any.whl.metadata (5.3 kB)
Requirement already satisfied: numpy!=1.24.0,>=1.20 in /opt/hostedtoolcache/Python/3.8.18/x64/lib/python3.8/site-packages (from seaborn) (1.24.4)
Requirement already satisfied: pandas>=1.2 in /opt/hostedtoolcache/Python/3.8.18/x64/lib/python3.8/site-packages (from seaborn) (2.0.3)
Requirement already satisfied: matplotlib!=3.6.1,>=3.3 in /opt/hostedtoolcache/Python/3.8.18/x64/lib/python3.8/site-packages (from seaborn) (3.7.4)
Requirement already satisfied: contourpy>=1.0.1 in /opt/hostedtoolcache/Python/3.8.18/x64/lib/python3.8/site-packages (from matplotlib!=3.6.1,>=3.3->seaborn) (1.1.1)
Requirement already satisfied: cycler>=0.10 in /opt/hostedtoolcache/Python/3.8.18/x64/lib/python3.8/site-packages (from matplotlib!=3.6.1,>=3.3->seaborn) (0.12.1)
Requirement already satisfied: fonttools>=4.22.0 in /opt/hostedtoolcache/Python/3.8.18/x64/lib/python3.8/site-packages (from matplotlib!=3.6.1,>=3.3->seaborn) (4.46.0)
Requirement already satisfied: kiwisolver>=1.0.1 in /opt/hostedtoolcache/Python/3.8.18/x64/lib/python3.8/site-packages (from matplotlib!=3.6.1,>=3.3->seaborn) (1.4.5)
Requirement already satisfied: packaging>=20.0 in /opt/hostedtoolcache/Python/3.8.18/x64/lib/python3.8/site-packages (from matplotlib!=3.6.1,>=3.3->seaborn) (23.2)
Requirement already satisfied: pillow>=6.2.0 in /opt/hostedtoolcache/Python/3.8.18/x64/lib/python3.8/site-packages (from matplotlib!=3.6.1,>=3.3->seaborn) (10.1.0)
Requirement already satisfied: pyparsing>=2.3.1 in /opt/hostedtoolcache/Python/3.8.18/x64/lib/python3.8/site-packages (from matplotlib!=3.6.1,>=3.3->seaborn) (3.1.1)
Requirement already satisfied: python-dateutil>=2.7 in /opt/hostedtoolcache/Python/3.8.18/x64/lib/python3.8/site-packages (from matplotlib!=3.6.1,>=3.3->seaborn) (2.8.2)
Requirement already satisfied: importlib-resources>=3.2.0 in /opt/hostedtoolcache/Python/3.8.18/x64/lib/python3.8/site-packages (from matplotlib!=3.6.1,>=3.3->seaborn) (6.1.1)
Requirement already satisfied: pytz>=2020.1 in /opt/hostedtoolcache/Python/3.8.18/x64/lib/python3.8/site-packages (from pandas>=1.2->seaborn) (2023.3.post1)
Requirement already satisfied: tzdata>=2022.1 in /opt/hostedtoolcache/Python/3.8.18/x64/lib/python3.8/site-packages (from pandas>=1.2->seaborn) (2023.3)
Requirement already satisfied: zipp>=3.1.0 in /opt/hostedtoolcache/Python/3.8.18/x64/lib/python3.8/site-packages (from importlib-resources>=3.2.0->matplotlib!=3.6.1,>=3.3->seaborn) (3.17.0)
Requirement already satisfied: six>=1.5 in /opt/hostedtoolcache/Python/3.8.18/x64/lib/python3.8/site-packages (from python-dateutil>=2.7->matplotlib!=3.6.1,>=3.3->seaborn) (1.16.0)
Downloading seaborn-0.13.0-py3-none-any.whl (294 kB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 294.6/294.6 kB 11.4 MB/s eta 0:00:00
Installing collected packages: seaborn
Successfully installed seaborn-0.13.0

[notice] A new release of pip is available: 23.0.1 -> 23.3.1
[notice] To update, run: pip install --upgrade pip
Collecting scikit-learn
  Downloading scikit_learn-1.3.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)
Requirement already satisfied: numpy<2.0,>=1.17.3 in /opt/hostedtoolcache/Python/3.8.18/x64/lib/python3.8/site-packages (from scikit-learn) (1.24.4)
Collecting scipy>=1.5.0 (from scikit-learn)
  Downloading scipy-1.10.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (34.5 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 34.5/34.5 MB 73.3 MB/s eta 0:00:00
Collecting joblib>=1.1.1 (from scikit-learn)
  Downloading joblib-1.3.2-py3-none-any.whl.metadata (5.4 kB)
Collecting threadpoolctl>=2.0.0 (from scikit-learn)
  Downloading threadpoolctl-3.2.0-py3-none-any.whl.metadata (10.0 kB)
Downloading scikit_learn-1.3.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (11.1 MB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 11.1/11.1 MB 106.3 MB/s eta 0:00:00
Downloading joblib-1.3.2-py3-none-any.whl (302 kB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 302.2/302.2 kB 56.1 MB/s eta 0:00:00
Downloading threadpoolctl-3.2.0-py3-none-any.whl (15 kB)
Installing collected packages: threadpoolctl, scipy, joblib, scikit-learn
Successfully installed joblib-1.3.2 scikit-learn-1.3.2 scipy-1.10.1 threadpoolctl-3.2.0

[notice] A new release of pip is available: 23.0.1 -> 23.3.1
[notice] To update, run: pip install --upgrade pip

Data Cleaning¶

In [2]:
import pandas as pd
import numpy as np

raw_df = pd.read_excel("datasets/rawdata.xlsx",keep_default_na=False)

raw_df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 244 entries, 0 to 243
Data columns (total 22 columns):
 #   Column                                                                                                                                               Non-Null Count  Dtype         
---  ------                                                                                                                                               --------------  -----         
 0   ID                                                                                                                                                   244 non-null    int64         
 1   Start time                                                                                                                                           244 non-null    datetime64[ns]
 2   Completion time                                                                                                                                      244 non-null    datetime64[ns]
 3   Email                                                                                                                                                244 non-null    object        
 4   Name                                                                                                                                                 244 non-null    object        
 5   Last modified time                                                                                                                                   244 non-null    object        
 6   I am a:                                                                                                                                              244 non-null    object        
 7   What is your Gender Identity?                                                                                                                        244 non-null    object        
 8   What is your Age?
(this field may remain blank)                                                                                                      244 non-null    object        
 9   What is your Race?                                                                                                                                   244 non-null    object        
 10  What is your current Employment Status?                                                                                                              244 non-null    object        
 11  Do you wear glasses or contact lenses for vision correction?                                                                                         244 non-null    object        
 12  What best describes your current religious or spiritual beliefs, if any?
                                                                            244 non-null    object        
 13  What is your Area of Study or Major? 
(this field may remain blank)
                                                                                 244 non-null    object        
 14  Do you enjoy playing board games, card games, or similar tabletop games?                                                                             244 non-null    object        
 15  How many board/card games do you own? 
(All non-roleplay table top games are included, i.e., chess, playing cards, Uno, Magic: The Gathering, etc.)  244 non-null    object        
 16  About how frequently do you play board/card games?                                                                                                   244 non-null    object        
 17  Select what best describes your engagement and style when it comes to board and card games?                                                          244 non-null    object        
 18  I prefer games that include elements of:
(select all that you prefer)                                                                                244 non-null    object        
 19  List up to 15 board/card games that you enjoy playing. 
(abide by the format: "Game 1", "Game 2", "Game 3")
(this field may remain blank)
           244 non-null    object        
 20  On a scale from 1 to 10, how much do you enjoy playing chess, with 1 being not enjoyable at all and 10 being extremely enjoyable?                    244 non-null    object        
 21  Select your all genres of board/card games you enjoy.
(if your preferred genre(s) is/are not listed, please list them in the "other" selection)      244 non-null    object        
dtypes: datetime64[ns](2), int64(1), object(19)
memory usage: 42.1+ KB
Drop Metadata¶
In [3]:
dropped_df = raw_df.drop(columns=["ID","Start time","Completion time","Email","Name","Last modified time"])

dropped_df[:0]
Out[3]:
I am a: What is your Gender Identity? What is your Age?\n(this field may remain blank) What is your Race? What is your current Employment Status? Do you wear glasses or contact lenses for vision correction? What best describes your current religious or spiritual beliefs, if any?\n What is your Area of Study or Major? \n(this field may remain blank)\n Do you enjoy playing board games, card games, or similar tabletop games? How many board/card games do you own? \n(All non-roleplay table top games are included, i.e., chess, playing cards, Uno, Magic: The Gathering, etc.) About how frequently do you play board/card games? Select what best describes your engagement and style when it comes to board and card games? I prefer games that include elements of:\n(select all that you prefer) List up to 15 board/card games that you enjoy playing. \n(abide by the format: "Game 1", "Game 2", "Game 3")\n(this field may remain blank)\n On a scale from 1 to 10, how much do you enjoy playing chess, with 1 being not enjoyable at all and 10 being extremely enjoyable? Select your all genres of board/card games you enjoy.\n(if your preferred genre(s) is/are not listed, please list them in the "other" selection)

Rename Raw Fields from Question to Concise Description¶

In [4]:
renamed_df = dropped_df.rename(columns={
    "I am a:": "WWUStatus",
    "What is your Gender Identity?": "Gender",
    "What is your Age?\n(this field may remain blank)": "Age",
    "What is your Race?": "Race(s)",
    "What is your current Employment Status?": "EmploymentStatus",
    "Do you wear glasses or contact lenses for vision correction?": "Vision",
    "What best describes your current religious or spiritual beliefs, if any?\n": "Religiosity",
    "What is your Area of Study or Major? \n(this field may remain blank)\n": "AOS",
    "Do you enjoy playing board games, card games, or similar tabletop games?": "EnjoysBoardGames",
    "How many board/card games do you own? \n(All non-roleplay table top games are included, i.e., chess, playing cards, Uno, Magic: The Gathering, etc.)": "BoardGamesOwned",
    "About how frequently do you play board/card games?": "FrequencyOfPlay",
    "Select what best describes your engagement and style when it comes to board and card games? ": "Style",
    "I prefer games that include elements of:\n(select all that you prefer)": "PreferredElements",
    "List up to 15 board/card games that you enjoy playing. \n(abide by the format: \"Game 1\", \"Game 2\", \"Game 3\")\n(this field may remain blank)\n": "EnjoyedBoardGames",
    "On a scale from 1 to 10, how much do you enjoy playing chess, with 1 being not enjoyable at all and 10 being extremely enjoyable?": "ChessRating",
    "Select your all genres of board/card games you enjoy.\n(if your preferred genre(s) is/are not listed, please list them in the \"other\" selection)": "EnjoyedGenres"
})

renamed_df[:0]
Out[4]:
WWUStatus Gender Age Race(s) EmploymentStatus Vision Religiosity AOS EnjoysBoardGames BoardGamesOwned FrequencyOfPlay Style PreferredElements EnjoyedBoardGames ChessRating EnjoyedGenres

Multiselect Binary/Boolean Features¶

Race(s)¶
In [5]:
renamed_df["Race(s)"].value_counts()
Out[5]:
Race(s)
White;                                           185
Asian;                                            14
Hispanic or Latino;                                9
White;Asian;                                       6
White;Hispanic or Latino;                          5
Black or African American;White;                   4
White;Black or African American;                   3
White;Native Hawaiian or Pacific Islander;         2
Black or African American;                         2
Asian;Hispanic or Latino;                          2
Black or African American;White;Asian;             1
Black or African American;Hispanic or Latino;      1
White;Native American or American Indian;          1
Multiracial;                                       1
Mixed ethnicity;                                   1
Native American or American Indian;                1
Asian;Filipino;                                    1
Hispanic or Latino;White;                          1
Asian;White;                                       1
White;Black or African American;Asian;             1
Prefer not to say;                                 1
Native American or American Indian;White;          1
Name: count, dtype: int64
In [6]:
raceClean_df = renamed_df.copy()

unique_races = set()
for elements in raceClean_df["Race(s)"].dropna():
    unique_races.update(elements.split(";"))

unique_races = list(unique_races)
unique_races.sort()
In [7]:
# Create a new column for each unique element with binary values (0 or 1 or None)
for element in unique_races:
    raceClean_df[(element + "IsRace").replace(" ", "")] = raceClean_df["Race(s)"].apply(
        lambda x: int(element in x) if pd.notna(x) else None
    )

raceClean_df = raceClean_df.drop(columns={"Race(s)","IsRace"})
PreferredElements¶
In [8]:
raceClean_df["PreferredElements"].value_counts()
Out[8]:
PreferredElements
Conflict/Competition;Strategy;                                                                                                                 7
Cooperation;Conflict/Competition;Luck;Strategy;Social Deduction/Hidden Role;Heavy/Immersive Theming;Puzzle-Solving;Trivia;Party/Low-Stakes;    6
Party/Low-Stakes;                                                                                                                              4
Conflict/Competition;Strategy;Trivia;                                                                                                          4
Cooperation;Conflict/Competition;Luck;Strategy;Puzzle-Solving;Party/Low-Stakes;                                                                3
                                                                                                                                              ..
Cooperation;Conflict/Competition;Strategy;Luck;                                                                                                1
Cooperation;Strategy;Luck;Conflict/Competition;Social Deduction/Hidden Role;Trivia;Party/Low-Stakes;                                           1
Luck;Strategy;Conflict/Competition;Party/Low-Stakes;Trivia;                                                                                    1
Cooperation;Luck;Conflict/Competition;                                                                                                         1
Social Deduction/Hidden Role;Party/Low-Stakes;                                                                                                 1
Name: count, Length: 202, dtype: int64
In [9]:
preferredElementsClean_df = raceClean_df.copy()

unique_preferred_elements = set()
for elements in preferredElementsClean_df["PreferredElements"].dropna():
    unique_preferred_elements.update(elements.split(";"))
unique_preferred_elements

unique_preferred_elements = list(unique_preferred_elements)
unique_preferred_elements.sort()
In [10]:
# Create a new column for each unique element with binary values (0 or 1 or None)
for element in unique_preferred_elements:
    preferredElementsClean_df[(element + "IsPreferredElement").replace(" ", "")] = preferredElementsClean_df["PreferredElements"].apply(
        lambda x: int(element in x) if pd.notna(x) else None
    )

preferredElementsClean_df = preferredElementsClean_df.drop(columns={"PreferredElements","IsPreferredElement"})
EnjoyedGenres¶
In [11]:
enjoyedGenresClean_df = preferredElementsClean_df.copy()

unique_enjoyed_genres = set()
for elements in enjoyedGenresClean_df["EnjoyedGenres"].dropna():
    unique_enjoyed_genres.update(elements.split(";"))
unique_enjoyed_genres

unique_enjoyed_genres = list(unique_enjoyed_genres)
unique_enjoyed_genres.sort()
In [12]:
# Create a new column for each unique element with binary values (0 or 1 or None)
for element in unique_enjoyed_genres:
    enjoyedGenresClean_df[(element + "IsEnjoyedGenre").replace(" ", "")] = enjoyedGenresClean_df["EnjoyedGenres"].apply(
        lambda x: int(element in x) if pd.notna(x) else None
    )

enjoyedGenresClean_df = enjoyedGenresClean_df.drop(columns={"EnjoyedGenres","IsEnjoyedGenre"})
In [13]:
enjoyedGenresClean_df.columns.tolist()
Out[13]:
['WWUStatus',
 'Gender',
 'Age',
 'EmploymentStatus',
 'Vision',
 'Religiosity',
 'AOS',
 'EnjoysBoardGames',
 'BoardGamesOwned',
 'FrequencyOfPlay',
 'Style',
 'EnjoyedBoardGames',
 'ChessRating',
 'AsianIsRace',
 'BlackorAfricanAmericanIsRace',
 'FilipinoIsRace',
 'HispanicorLatinoIsRace',
 'MixedethnicityIsRace',
 'MultiracialIsRace',
 'NativeAmericanorAmericanIndianIsRace',
 'NativeHawaiianorPacificIslanderIsRace',
 'PrefernottosayIsRace',
 'WhiteIsRace',
 'Conflict/CompetitionIsPreferredElement',
 'CooperationIsPreferredElement',
 'Heavy/ImmersiveThemingIsPreferredElement',
 'LuckIsPreferredElement',
 'Party/Low-StakesIsPreferredElement',
 'Puzzle-SolvingIsPreferredElement',
 'SocialDeduction/HiddenRoleIsPreferredElement',
 'StrategyIsPreferredElement',
 'TriviaIsPreferredElement',
 'AbstractStrategyIsEnjoyedGenre',
 'AdventureIsEnjoyedGenre',
 'AnimalsIsEnjoyedGenre',
 'AuctionIsEnjoyedGenre',
 'CardIsEnjoyedGenre',
 'CardDraftingIsEnjoyedGenre',
 'CivilizationIsEnjoyedGenre',
 'Cooperative\xa0IsEnjoyedGenre',
 'Deck-Building\xa0IsEnjoyedGenre',
 'DeductionIsEnjoyedGenre',
 'EconomicIsEnjoyedGenre',
 'EducationalIsEnjoyedGenre',
 'ExplorationIsEnjoyedGenre',
 'FantasyIsEnjoyedGenre',
 'FarmingIsEnjoyedGenre',
 'FightingIsEnjoyedGenre',
 'HorrorIsEnjoyedGenre',
 'LuckIsEnjoyedGenre',
 'MedievalIsEnjoyedGenre',
 'MemoryIsEnjoyedGenre',
 'MiniaturesIsEnjoyedGenre',
 'Party\xa0IsEnjoyedGenre',
 'PiratesIsEnjoyedGenre',
 'PoliticalIsEnjoyedGenre',
 'PuzzleIsEnjoyedGenre',
 'RacingIsEnjoyedGenre',
 'Role-Playing\xa0IsEnjoyedGenre',
 'RollandMove\xa0IsEnjoyedGenre',
 'ScienceFictionIsEnjoyedGenre',
 'SocialDeduction/HiddenRoleIsEnjoyedGenre',
 'SportsIsEnjoyedGenre',
 'StrategyIsEnjoyedGenre',
 'TerritoryBuildingIsEnjoyedGenre',
 'Tile-Laying\xa0IsEnjoyedGenre',
 'TrainsIsEnjoyedGenre',
 'TransportationIsEnjoyedGenre',
 'TravelIsEnjoyedGenre',
 'TriviaIsEnjoyedGenre',
 'War\xa0IsEnjoyedGenre',
 'Word\xa0IsEnjoyedGenre',
 'WorkerPlacementIsEnjoyedGenre',
 'WorldWarIIIsEnjoyedGenre',
 'ZombiesIsEnjoyedGenre']
In [14]:
reformatColumns_df = enjoyedGenresClean_df.rename(columns=lambda x: x.replace("\xa0", ""))

reformatColumns_df.columns.tolist()
Out[14]:
['WWUStatus',
 'Gender',
 'Age',
 'EmploymentStatus',
 'Vision',
 'Religiosity',
 'AOS',
 'EnjoysBoardGames',
 'BoardGamesOwned',
 'FrequencyOfPlay',
 'Style',
 'EnjoyedBoardGames',
 'ChessRating',
 'AsianIsRace',
 'BlackorAfricanAmericanIsRace',
 'FilipinoIsRace',
 'HispanicorLatinoIsRace',
 'MixedethnicityIsRace',
 'MultiracialIsRace',
 'NativeAmericanorAmericanIndianIsRace',
 'NativeHawaiianorPacificIslanderIsRace',
 'PrefernottosayIsRace',
 'WhiteIsRace',
 'Conflict/CompetitionIsPreferredElement',
 'CooperationIsPreferredElement',
 'Heavy/ImmersiveThemingIsPreferredElement',
 'LuckIsPreferredElement',
 'Party/Low-StakesIsPreferredElement',
 'Puzzle-SolvingIsPreferredElement',
 'SocialDeduction/HiddenRoleIsPreferredElement',
 'StrategyIsPreferredElement',
 'TriviaIsPreferredElement',
 'AbstractStrategyIsEnjoyedGenre',
 'AdventureIsEnjoyedGenre',
 'AnimalsIsEnjoyedGenre',
 'AuctionIsEnjoyedGenre',
 'CardIsEnjoyedGenre',
 'CardDraftingIsEnjoyedGenre',
 'CivilizationIsEnjoyedGenre',
 'CooperativeIsEnjoyedGenre',
 'Deck-BuildingIsEnjoyedGenre',
 'DeductionIsEnjoyedGenre',
 'EconomicIsEnjoyedGenre',
 'EducationalIsEnjoyedGenre',
 'ExplorationIsEnjoyedGenre',
 'FantasyIsEnjoyedGenre',
 'FarmingIsEnjoyedGenre',
 'FightingIsEnjoyedGenre',
 'HorrorIsEnjoyedGenre',
 'LuckIsEnjoyedGenre',
 'MedievalIsEnjoyedGenre',
 'MemoryIsEnjoyedGenre',
 'MiniaturesIsEnjoyedGenre',
 'PartyIsEnjoyedGenre',
 'PiratesIsEnjoyedGenre',
 'PoliticalIsEnjoyedGenre',
 'PuzzleIsEnjoyedGenre',
 'RacingIsEnjoyedGenre',
 'Role-PlayingIsEnjoyedGenre',
 'RollandMoveIsEnjoyedGenre',
 'ScienceFictionIsEnjoyedGenre',
 'SocialDeduction/HiddenRoleIsEnjoyedGenre',
 'SportsIsEnjoyedGenre',
 'StrategyIsEnjoyedGenre',
 'TerritoryBuildingIsEnjoyedGenre',
 'Tile-LayingIsEnjoyedGenre',
 'TrainsIsEnjoyedGenre',
 'TransportationIsEnjoyedGenre',
 'TravelIsEnjoyedGenre',
 'TriviaIsEnjoyedGenre',
 'WarIsEnjoyedGenre',
 'WordIsEnjoyedGenre',
 'WorkerPlacementIsEnjoyedGenre',
 'WorldWarIIIsEnjoyedGenre',
 'ZombiesIsEnjoyedGenre']

AOS (manual string bucketing)¶

In [15]:
aos_df = reformatColumns_df
aos_df["AOS"].str.lower().unique()
Out[15]:
array(['computer science ', 'data science ', 'data science', 'eece',
       'electrical and computer engineering ', 'statistics',
       'environmental studies', 'applied mathematics', 'chemistry', 'rml',
       'political science', 'elementary education', 'english',
       'music education', 'n/a', 'art', 'psychology', '',
       'psych (probably)', 'music', 'environmental science - toxicology',
       'history/museum studies', 'elementary ed',
       'environmental science ', 'mathematics ', 'business', 'biochem ',
       'vocal performance ', 'secondary education ', 'business ',
       'linguistics', 'history', 'bio/anth', 'mathematics',
       'marine biology ', 'environmental science',
       'communication disorders', 'engineering ', 'biochem',
       'kinesiology', 'economics and mathematics',
       'music education and german', 'art p-12',
       'chemistry either organic or inorganic', 'math',
       'electrical engineering',
       'undecided but leaning towards engineering', 'medicine ',
       'rec management ', 'economics ', 'geology', 'visual journalism ',
       'environmental studies ', 'biology/math',
       'behavioral neuroscience', 'electrical engineering ',
       'computer science', 'geology (paleoclimate)',
       'marine biology and theater production', 'anthropology',
       'biology ', 'management information systems ', 'marine bio',
       'history/holocaust & genocide studies', 'sped & eled',
       'visual journalism', 'anthropology, communication studies',
       'theatre', 'studio art',
       'urban planning and sustainable development',
       'urban planning and sustainable development ', 'history ',
       'art and design ', 'kinesiology ', 'spanish ', 'biochemistry ',
       'art studio', 'art ed', 'comm', 'early childhood education ',
       'creative writing', 'neuroscience ', 'marine science ',
       'marketing ', 'behavioral neuroscience ', 'pre nursing ',
       'engineering', 'graphic design', 'undecided',
       'english literature with a teaching emphasis',
       'political science ', 'international business ',
       'communication studies', 'dance',
       'narrative and folklore studies (fairhaven major) ', 'psychology ',
       'anthropology ', 'pre med and psychology ', 'biology',
       'education and public relations', 'economics/mathematics',
       'communications', 'art studio (ba), art history',
       'elementary education ', 'archaeology ', 'theatre/education',
       'marketing', 'business and sustainability ', 'biochemistry',
       'environmental studies: eco-social justice and education emphasis',
       'education ', 'education', 'envs ',
       'mathematics secondary education', 'music composition',
       'sociology ', 'stem', 'linguistics ', 'fairhaven', 'fairhaven ',
       'behavioural neuroscience', 'english lit',
       'food equity and sustainable agriculture ',
       'art history and museum studies', 'japanese language ',
       'graphic design and marketing ', 'music performance major',
       'environment studies', 'business or elementary education ',
       'marine and coastal science',
       'undeclared, strongly thinking about history ', 'public health',
       'energy policy and management ', 'undeclared', 'fine arts',
       'undecided ', 'english, history of culture ',
       'psychology and elementary education ',
       'communication science and disordwrs', 'anthropology/history',
       'special education and elementary education ', 'ibus',
       'energy science', 'politics/philosophy/economics', 'studio art ',
       'history/social studies', 'energy'], dtype=object)
In [16]:
manual_mapping_aos = {
    "computer science": "STEM",
    "data science": "STEM",
    "eece": "STEM",
    "electrical and computer engineering": "STEM",
    "statistics": "STEM",
    "environmental studies": "STEM",
    "applied mathematics": "STEM",
    "chemistry": "STEM",
    "rml": "Other",
    "political science": "Social Studies",
    "elementary education": "Education",
    "english": "Arts & Humanities",
    "music education": "Arts & Humanities",
    "nan": "Other",
    "art": "Arts & Humanities",
    "psychology": "Social Studies",
    "psych (probably)": "Social Studies",
    "music": "Arts & Humanities",
    "environmental science - toxicology": "STEM",
    "history/museum studies": "Arts & Humanities",
    "elementary ed": "Education",
    "environmental science": "STEM",
    "mathematics": "STEM",
    "business": "Business",
    "biochem": "STEM",
    "vocal performance": "Arts & Humanities",
    "secondary education": "Education",
    "linguistics": "Arts & Humanities",
    "history": "Arts & Humanities",
    "bio/anth": "STEM",
    "marine biology": "STEM",
    "communication disorders": "Health & Medicine",
    "engineering": "STEM",
    "kinesiology": "Health & Medicine",
    "economics and mathematics": "STEM",
    "music education and german": "Arts & Humanities",
    "art p-12": "Arts & Humanities",
    "chemistry either organic or inorganic": "STEM",
    "math": "STEM",
    "electrical engineering": "STEM",
    "undecided but leaning towards engineering": "Unknown",  # Assuming lean towards STEM, but no exact match
    "medicine": "Health & Medicine",
    "rec management": "Other",  # Assuming Recreation Management
    "economics": "Social Studies",  # Close to "economics and mathematics", but economics is often considered Social Studies
    "geology": "STEM",
    "geology (paleoclimate)": "STEM",
    "visual journalism": "Arts & Humanities",
    "biology/math": "STEM",  # Combination of two STEM fields
    "behavioral neuroscience": "STEM",  # Close to "psychology" which is Social Studies, but has a heavy STEM component
    "marine biology and theater production": "STEM",  # Marine biology is STEM, theater production could be Arts, but STEM is the primary
    "anthropology": "Social Studies",
    "biology": "STEM",
    "management information systems": "Business",
    "marine bio": "STEM",
    "history/holocaust & genocide studies": "Arts & Humanities",
    "sped & eled": "Education",  # Assuming this refers to special education & elementary education
    "anthropology, communication studies": "Social Studies",
    "theatre": "Arts & Humanities",
    "studio art": "Arts & Humanities",
    "urban planning and sustainable development": "Other",  # Not a clear category, could be Social Studies or another category
    "art and design": "Arts & Humanities",
    "spanish": "Arts & Humanities",  # Language studies are often classified here
    "biochemistry": "STEM",
    "art studio": "Arts & Humanities",
    "art ed": "Arts & Humanities",
    "comm": "Other",  # Assuming "communication", but not explicitly listed, could fit Social Studies or Business,
    
    "environmental studies: eco-social justice and education emphasis": "STEM",  # Falls under Environmental Studies
    "communications": "Other",  # Often classified as Arts & Humanities
    "theatre/education": "Education",  # Falls under Education
    "undecided": "Unknown",  # Assuming still undecided as before
    "marketing": "Business",  # Falls under Business
    "communication studies": "Arts & Humanities",  # Often classified as Arts & Humanities
    "sociology": "Social Studies",  # Social Studies
    "education and public relations": "Education",  # Falls under Education
    "pre nursing": "Health & Medicine",  # Falls under Health & Medicine
    "economics/mathematics": "STEM",  # Combination of Economics and Mathematics - falls under STEM
    "mathematics secondary education": "Education",  # Falls under Education
    "dance": "Arts & Humanities",  # Often classified as Arts & Humanities
    "art studio (ba), art history": "Arts & Humanities",  # Falls under Arts & Humanities
    "narrative and folklore studies (fairhaven major)": "Arts & Humanities",  # Falls under Arts & Humanities
    "pre med and psychology": "Health & Medicine",  # Falls under Health & Medicine
    "archaeology": "Social Studies",  # Falls under Social Studies
    "neuroscience": "STEM",  # Falls under STEM
    "english literature with a teaching emphasis": "Arts & Humanities",  # Falls under Arts & Humanities
    "marine science": "STEM",  # Falls under STEM
    "fairhaven": "Other",  # Falls under Other
    "international business": "Business",  # Falls under Business
    "music composition": "Arts & Humanities",  # Falls under Arts & Humanities
    "creative writing": "Arts & Humanities",  # Falls under Arts & Humanities
    "business and sustainability": "Business",  # Falls under Business
    "early childhood education": "Education",  # Falls under Education
    "graphic design": "Arts & Humanities",  # Falls under Arts & Humanities
    "education": "Education",  # Falls under Education
    "stem": "STEM",  # Falls under STEM
    "envs": "STEM",  # Falls under STEM

    
    "behavioural neuroscience": "STEM",
    "english lit": "Arts & Humanities",
    "food equity and sustainable agriculture": "Other",  # Could be Social Studies, Business, or even STEM, unclear
    "art history and museum studies": "Arts & Humanities",
    "japanese language": "Arts & Humanities",
    "graphic design and marketing": "Arts & Humanities",  # Graphic Design is often in Arts & Humanities, Marketing is Business, but first seems primary
    "music performance major": "Arts & Humanities",
    "environment studies": "STEM",
    "business or elementary education": "Unknown",  # Could be either Business or Education 
    "marine and coastal science": "STEM",
    "undeclared, strongly thinking about history": "Unknown",  # Assuming lean towards Arts & Humanities, but no exact match
    "public health": "Health & Medicine",
    "energy policy and management": "Other",  # Could be Business, Social Studies, or STEM
    "undeclared": "Unknown",
    "fine arts": "Arts & Humanities",
    "english, history of culture": "Arts & Humanities",
    "psychology and elementary education": "Education",  # Both Psychology and Elementary Education could be Education
    "communication science and disordwrs": "Health & Medicine",
    "anthropology/history": "Social Studies",
    "special education and elementary education": "Education",
    "ibus": "Business",  # Assuming International Business
    "energy science": "STEM",
    "politics/philosophy/economics": "Social Studies",  # Combination of three Social Studies fields
    "history/social studies": "Social Studies",
    "energy": "STEM"  # Energy could be a part of STEM disciplines like Physics or Environmental Sciences
}


noncategorized_data = []

def categorize_aos_string(aos):
    if pd.isnull(aos) or aos == "n/a" or aos == "" or aos == "N/A":
        return "Unknown"  # For handling NaN values
    field_clean = aos.lower().rstrip()
    for key, category in manual_mapping_aos.items():
        if key == field_clean:
            return category
    noncategorized_data.append(field_clean)    


aos_df["AOSCat"] = aos_df["AOS"].apply(categorize_aos_string)

if (len(noncategorized_data) > 0):
    raise ValueError(f"Unknown categories: {noncategorized_data}") 

aos_df["AOSCat"].unique()
Out[16]:
array(['STEM', 'Other', 'Social Studies', 'Education',
       'Arts & Humanities', 'Unknown', 'Business', 'Health & Medicine'],
      dtype=object)

Shorting the Length of Single Select Answer Choices for "Style"¶

In [17]:
style_df = aos_df 

style_df["Style"].unique()
Out[17]:
array(['Not interested in playing board/card games.',
       'Party-only player, primarily playing board/card games at social gatherings or parties.',
       'Strategy-focused player, enjoying games that require planning and tactics, but also participates in more casual games.',
       'Situation-Specific Player, adapts style and enthusiasm based on the specific game or social context. May be casual in some situations and highly strategic in others, depending on the game being played.',
       'None of these describe me.',
       'Casual player, participating for fun and relaxation, without a strong focus on winning. Still willing to engage with complex games.',
       'Simple and straightforward player, preferring uncomplicated games with easy rules.'],
      dtype=object)
In [18]:
player_styles_mapping = {
    "Strategy-focused player, enjoying games that require planning and tactics, but also participates in more casual games.": "Strategic",
    "Situation-Specific Player, adapts style and enthusiasm based on the specific game or social context. May be casual in some situations and highly strategic in others, depending on the game being played.": "Situation-Specific",
    "Casual player, participating for fun and relaxation, without a strong focus on winning. Still willing to engage with complex games.": "Casual",
    "Simple and straightforward player, preferring uncomplicated games with easy rules.": "Simple",
    "Party-only player, primarily playing board/card games at social gatherings or parties.": "Party/Social",
    "Not interested in playing board/card games.": "Not Interested",
    "Never played or never had the opportunity to play board/card games.": "Never Played",
    "None of these describe me.": "Other",
    "Prefer not to say": "Prefer not to say"
}

def map_player_styles(string):
    for key, val in player_styles_mapping.items():
        if key in string:
            return val
    raise ValueError(f"Unknown category: {string}") 

new_style_df = style_df.copy()

new_style_df["Style"] = style_df["Style"].apply(map_player_styles)

new_style_df["Style"].unique()
Out[18]:
array(['Not Interested', 'Party/Social', 'Strategic',
       'Situation-Specific', 'Other', 'Casual', 'Simple'], dtype=object)

Assigning Order to Particular Catagories¶

In [19]:
cat_order_df = new_style_df.copy()

cat_order_df["BoardGamesOwned"] = cat_order_df["BoardGamesOwned"].apply(lambda x: "0" if x == "I do not own any board/card games.\xa0" else x)

cat_order_df["BoardGamesOwned"] = pd.Categorical(cat_order_df["BoardGamesOwned"],
    categories=["Prefer not to say","0", "1 or 2", "2 to 5", "5 to 10", "10 to 20", "More than 20"],ordered=True)


cat_order_df["FrequencyOfPlay"] = pd.Categorical(cat_order_df["FrequencyOfPlay"],
    categories=["Daily","Several times a week","Weekly","Several times a month","Monthly","Every few months","Rarely/Seldom","Never","Prefer not to say",],ordered=True)


cat_order_df["ChessRating"] = cat_order_df["ChessRating"].apply(lambda x: "Unknown" if x == "" else x)

cat_order_df["ChessRating"] = pd.Categorical(cat_order_df["ChessRating"].astype(str),
    categories=["Unknown","1","2","3","4","5","6","7","8","9","10"], ordered=True)

Convert Features to Int¶

In [20]:
convert_df = cat_order_df.copy()
convert_df["Age"] = convert_df["Age"].apply(lambda x: -1 if x == "" else x)
convert_df["Age"] = convert_df["Age"].astype(int)
In [21]:
cleaned_df = convert_df

cleaned_df.to_csv("datasets/cleaned.csv")

Exploratory Data Analysis¶

In [22]:
single_select_features = ['WWUStatus', 'Gender', 'Age', 'EmploymentStatus', 'Vision',
       'Religiosity', 'AOSCat', 'EnjoysBoardGames', 'BoardGamesOwned',
       'FrequencyOfPlay', 'Style', 'ChessRating']

race_bool_features = ['WhiteIsRace','AsianIsRace', 'BlackorAfricanAmericanIsRace', 'FilipinoIsRace',
       'HispanicorLatinoIsRace','MixedethnicityIsRace', 'MultiracialIsRace', 'NativeAmericanorAmericanIndianIsRace',
       'NativeHawaiianorPacificIslanderIsRace', 'PrefernottosayIsRace',]
       
element_bool_features = ['Conflict/CompetitionIsPreferredElement',
       'CooperationIsPreferredElement',
       'Heavy/ImmersiveThemingIsPreferredElement', 'LuckIsPreferredElement',
       'Party/Low-StakesIsPreferredElement',
       'Puzzle-SolvingIsPreferredElement',
       'SocialDeduction/HiddenRoleIsPreferredElement',
       'StrategyIsPreferredElement', 'TriviaIsPreferredElement',]

genre_bool_features = ['AbstractStrategyIsEnjoyedGenre', 'AdventureIsEnjoyedGenre',
       'AnimalsIsEnjoyedGenre', 'AuctionIsEnjoyedGenre', 'CardIsEnjoyedGenre',
       'CardDraftingIsEnjoyedGenre', 'CivilizationIsEnjoyedGenre',
       'CooperativeIsEnjoyedGenre', 'Deck-BuildingIsEnjoyedGenre',
       'DeductionIsEnjoyedGenre', 'EconomicIsEnjoyedGenre',
       'EducationalIsEnjoyedGenre', 'ExplorationIsEnjoyedGenre',
       'FantasyIsEnjoyedGenre', 'FarmingIsEnjoyedGenre',
       'FightingIsEnjoyedGenre', 'HorrorIsEnjoyedGenre', 'LuckIsEnjoyedGenre',
       'MedievalIsEnjoyedGenre', 'MemoryIsEnjoyedGenre',
       'MiniaturesIsEnjoyedGenre', 'PartyIsEnjoyedGenre',
       'PiratesIsEnjoyedGenre', 'PoliticalIsEnjoyedGenre',
       'PuzzleIsEnjoyedGenre', 'RacingIsEnjoyedGenre',
       'Role-PlayingIsEnjoyedGenre', 'RollandMoveIsEnjoyedGenre',
       'ScienceFictionIsEnjoyedGenre',
       'SocialDeduction/HiddenRoleIsEnjoyedGenre', 'SportsIsEnjoyedGenre',
       'StrategyIsEnjoyedGenre', 'TerritoryBuildingIsEnjoyedGenre',
       'Tile-LayingIsEnjoyedGenre', 'TrainsIsEnjoyedGenre',
       'TransportationIsEnjoyedGenre', 'TravelIsEnjoyedGenre',
       'TriviaIsEnjoyedGenre', 'WarIsEnjoyedGenre', 'WordIsEnjoyedGenre',
       'WorkerPlacementIsEnjoyedGenre', 'WorldWarIIIsEnjoyedGenre',
       'ZombiesIsEnjoyedGenre',]

free_form_features = ['AOS','EnjoyedBoardGames']
In [23]:
len(cleaned_df.columns)
Out[23]:
76
In [24]:
len(single_select_features + race_bool_features + element_bool_features + genre_bool_features + free_form_features)
Out[24]:
76
In [25]:
cleaned_df["Religiosity"].value_counts()
Out[25]:
Religiosity
Atheism                                                        59
No specific belief                                             55
Agnosticism                                                    49
Christianity                                                   32
Spiritual, not affiliated with a specific religion             31
Judaism                                                         4
Prefer not to say                                               2
Pagan                                                           1
Islam                                                           1
Toaism                                                          1
Lutheran                                                        1
Hinduism                                                        1
Buddhism                                                        1
pagan                                                           1
Unitarian                                                       1
Science                                                         1
being with oneself in connection to everything                  1
Paganism                                                        1
I believe a god exists but don’t follow any religious texts     1
Name: count, dtype: int64
In [26]:
religious_bucketing = {
    "Christianity": "Religious",
    "Judaism": "Religious",
    "Islam": "Religious",
    "Hinduism": "Religious",
    "Buddhism": "Religious",
    "Lutheran": "Religious",
    "Pagan": "Religious",
    "Paganism": "Religious",
    "Unitarian": "Religious",
    "I believe a god exists but don’t follow any religious texts": "Religious",
    "Toaism": "Religious",
    "pagan": "Religious",

    "Atheism": "Not Religious",
    "No specific belief": "Not Religious",
    "Agnosticism": "Not Religious",
    "Spiritual, not affiliated with a specific religion": "Not Religious",
    "Science": "Not Religious",
    "being with oneself in connection to everything": "Not Religious",
    "Prefer not to say": "Not Religious",
}

Defining Functions for Analyzing Categorical Data¶

In [27]:
import matplotlib.pyplot as plt
import matplotlib.ticker as mtick
import seaborn as sns
import pandas as pd



def categories_against_category(df, categories, category):
    for feature in categories:
        print(df.groupby(category)[feature].value_counts())
        # Compute percentage for each category within each cluster
        df_percent = df.groupby(category)[feature].value_counts(normalize=True).rename('Percentage').reset_index()
        df_percent["Percentage"] *= 100  # Convert to %

        # Create seaborn barplot
        plt.figure(figsize=(8, 6))
        sns.barplot(x=feature, y='Percentage', hue=category, data=df_percent)
        plt.gca().yaxis.set_major_formatter(mtick.PercentFormatter())
        plt.xticks(rotation=90)
        plt.show()

def bools_against_category(df,bools,category):
    # Initialize an empty dataframe to store your percentages
    percentages = pd.DataFrame()

    # Loop through the bool_columns 
    for col in bools:
        # Compute the percentages of True (=1) occurrences in each cluster
        percents = df.groupby(category,observed=True)[col].mean().mul(100).reset_index()
        percents['Feature'] = col
        percents.rename({col: 'Percentage'}, axis=1, inplace=True)

        # Append the computed percentages to your dataframe
        percentages = pd.concat([percentages, percents])

    # Once your dataframe is ready, you can plot it using seaborn
    plt.figure(figsize=(12, 8))
    sns.barplot(x='Feature', y='Percentage', hue=category, data=percentages)
    plt.gca().yaxis.set_major_formatter(mtick.PercentFormatter())
    plt.xticks(rotation=90)
    plt.title(f"Percentage of 'True' Occurrences in Each Feature by {category}")
    plt.show()

# Intended for single-select/categorical feature comparison
# `x` should be a Panda Series
# `y` should be a Panda Series
def plot_count_and_percentage_heatmaps(x, y):
    data = pd.DataFrame({x.name: x, y.name: y})

    count_matrix = pd.crosstab(data[x.name], data[y.name])
    
    percent_matrix = count_matrix.div(count_matrix.sum(axis=1), axis=0) * 100

    fig, ax = plt.subplots(1, 2, figsize=(12, 6))

    # Heatmap for counts
    sns.heatmap(count_matrix, annot=True, fmt="d", cmap="YlGnBu", cbar=True, ax=ax[0])
    ax[0].set_title("Count Matrix of Categories")
    ax[0].set_xlabel(y.name)
    ax[0].set_ylabel(x.name)

    # Heatmap for percentages
    sns.heatmap(percent_matrix, annot=True, fmt=".2f", cmap="YlGnBu", cbar=True, ax=ax[1],vmin=0.0, vmax=100.0)
    ax[1].set_title("Percentage Matrix of Categories")
    ax[1].set_xlabel(y.name)
    ax[1].set_ylabel(x.name)

    plt.tight_layout()
    plt.show()

# Intended for categorical feature comparison against several bool fields
# `data` should be a Pandas Data Frame
# `x` should be a string that refers to the categorical feature in `data`
# `y` should be an array of strings that refer to the several bool features in `data`
def plot_heatmap_of_bool_features_percent(data, x, y):
    df_melted = data.melt(id_vars=x, value_vars=y, var_name="Bool", value_name="True")

    df_pivot = df_melted.groupby([x,"Bool"]).sum().reset_index().pivot(index=x, columns="Bool", values="True")

    total = data[x].value_counts()

    df_pivot = df_pivot.join(total)
    

    df_pivot.loc[:, df_pivot.columns != "count"] = ((df_pivot.loc[:, df_pivot.columns != "count"].div(df_pivot["count"], axis=0)) * 100).round(4)
    
    print(df_pivot["count"])
    df_pivot = df_pivot.drop(columns=["count"])
    
    df_pivot = df_pivot.transpose()

    sns.heatmap(df_pivot, annot=True, cmap="Blues", fmt=".2f",vmin=0.0, vmax=100.0)

# Intended for categorical feature comparison against several bool fields
# `data` should be a Pandas Data Frame
# `x` should be a string that refers to the categorical feature in `data`
# `y` should be an array of strings that refer to the several bool features in `data`
def plot_heatmap_of_bool_features_val_count(data, x, y):
    df_melted = data.melt(id_vars=x, value_vars=y, var_name="Bool", value_name="True")

    df_pivot = df_melted.groupby([x,"Bool"]).sum().reset_index().pivot(index=x, columns="Bool", values="True")

    total = data[x].value_counts()

    df_pivot = df_pivot.join(total)
    
    df_pivot.rename(columns={"count": "UniqueMembersOfParticular" + x}, inplace=True)
    
    df_pivot = df_pivot.transpose()
    # Plot
    sns.heatmap(df_pivot, annot=True, cmap="Blues", fmt=".2f")

# Used to reduce the number of categories in a feature by setting the value to "Other" for insufficient response quantity of specific catagories
# `df` should be a Pandas Data Frame
# `col_name` should be the string name of the column/feature you'd like to filter
# `threshold` should be the minimum count needed to maintain the categorical variable 

# For example, if `Age` gets a bunch of responses between 18 and 24, and you get a few 40s and one 60. You can use this function to reduce the 
# different categorical variables to simply 18 through 24 and "Other"
def filter_threshold(df, col_name, threshold):
    df_copy = df.copy() 
    counts = df_copy[col_name].value_counts()
    df_copy.loc[df_copy[col_name].isin(counts[counts < threshold].index), col_name] = "Other"
    return df_copy

Distribution of Features¶

The following code is used to filter out feature responses that are underrepresented or minimal in survey data, thus preventing inaccurate conclusions that could arise from only looking at a small sample of a population. Most of features graphed below have at least two prominent catagories for which to perform data analysis, which is sufficient. This includes:

  • Gender
  • Age (within the range of 18 to 24)
  • Employment Status (at least for Part-Time and Unemployed)
  • Vision
  • Area of Study Catagories
  • Number of Board Games Owned
  • Frequency of Play
  • Style
  • Chess Rating

However, a few fields did not get sufficient diverse responses to conclude anything, this includes:

  • WWU Status
    • Nearly everyone who took this survey was a student
  • Religiosity
    • Most people who filled out this survey were secular, and the largest minority religious group is simply too small to make any meaningful conclusions
  • Does person enjoys board games
    • The overwhelming majority of people who filled out this survey liked playing board games making relationships between demographics hard to draw
In [28]:
for feature in single_select_features:
    sns.countplot(x=feature, data=cleaned_df)
    plt.xticks(rotation=90) 
    plt.show()
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image

Distribution of Features (cont'd)¶

The following questions were multi-select and resulted in bool fields. Because of this, we need to display the data slightly differently: totalling all "True"/selected values from the related features.

The multi-select questions were in regard to Race, Preferred Gameplay Elements, and Enjoyed Genres.

Preferred Gameplay Elements and Enjoyed Genres got a sufficient distribution of responses, however, Race did not receive sufficiently diverse responses for analysis.

In [29]:
true_counts = {}

for col in race_bool_features:
    true_counts[col] = cleaned_df[cleaned_df[col] == 1].shape[0]

true_counts_series = pd.Series(true_counts)

sns.barplot(true_counts_series)
plt.xticks(rotation=90)
plt.show()
No description has been provided for this image
In [30]:
true_counts = {}

for col in element_bool_features:
    true_counts[col] = cleaned_df[cleaned_df[col] == 1].shape[0]

true_counts_series = pd.Series(true_counts)

sns.barplot(true_counts_series)
plt.xticks(rotation=90)
plt.show()
No description has been provided for this image
In [31]:
true_counts = {}

for col in genre_bool_features:
    true_counts[col] = cleaned_df[cleaned_df[col] == 1].shape[0]

true_counts_series = pd.Series(true_counts)

sns.barplot(true_counts_series)
plt.xticks(rotation=90)
plt.show()
No description has been provided for this image

(Random) Exploratory Analysis¶

This section is dedicated to graphing random sufficient features against one another in hopes of uncovering hidden relationships.

In [32]:
# There were many different responses for various types of non-binary gender identities, because of this, graphing the data becomes a bit muddled
# This is solved by grouping the non-binary responses into an "Other" category
genderForAnalysis_df = filter_threshold(cleaned_df,"Gender",25)
genderForAnalysis_df["Gender"].value_counts()
Out[32]:
Gender
Woman    109
Man      101
Other     34
Name: count, dtype: int64
In [33]:
genderForAnalysis_df = genderForAnalysis_df[genderForAnalysis_df["Gender"] != "Other"]
In [34]:
categories_against_category(genderForAnalysis_df,single_select_features,"Gender")
Gender  WWUStatus         
Man     WWU Student            98
        WWU Faculty Member      2
        Neither                 1
Woman   WWU Student           108
        WWU Faculty Member      1
Name: count, dtype: int64
No description has been provided for this image
Gender
Man      101
Woman    109
Name: count, dtype: int64
No description has been provided for this image
Gender  Age
Man      18    28
         19    23
         20    16
         21    11
         22     6
         23     6
        -1      5
         26     3
         24     2
         25     1
Woman    18    29
         19    23
         20    21
         21    15
         22     8
         23     4
         24     4
        -1      1
         25     1
         28     1
         29     1
         36     1
Name: count, dtype: int64
No description has been provided for this image
Gender  EmploymentStatus                 
Man     Unemployed                           51
        Employed part-time                   45
        Employed full-time                    2
        Seasonal worker during the summer     1
        Seeking employment                    1
        Self-employed                         1
Woman   Unemployed                           53
        Employed part-time                   50
        Prefer not to say                     2
        Employed full-time                    1
        One day a week babysitting            1
        Seasonally employed                   1
        Self-employed                         1
Name: count, dtype: int64
No description has been provided for this image
Gender  Vision           
Man     None                 61
        Glasses              27
        Both                  7
        Contacts              5
        Prefer not to say     1
Woman   None                 55
        Glasses              31
        Both                 18
        Contacts              5
Name: count, dtype: int64
No description has been provided for this image
Gender  Religiosity                                                
Man     Agnosticism                                                    26
        Atheism                                                        25
        No specific belief                                             24
        Christianity                                                   12
        Spiritual, not affiliated with a specific religion              6
        Judaism                                                         3
        Toaism                                                          1
        Science                                                         1
        Prefer not to say                                               1
        Islam                                                           1
        Buddhism                                                        1
Woman   No specific belief                                             26
        Atheism                                                        22
        Christianity                                                   20
        Agnosticism                                                    19
        Spiritual, not affiliated with a specific religion             17
        I believe a god exists but don’t follow any religious texts     1
        Lutheran                                                        1
        Judaism                                                         1
        Hinduism                                                        1
        Unitarian                                                       1
Name: count, dtype: int64
No description has been provided for this image
Gender  AOSCat           
Man     STEM                 48
        Arts & Humanities    16
        Unknown              14
        Social Studies        7
        Business              6
        Education             5
        Other                 3
        Health & Medicine     2
Woman   STEM                 35
        Arts & Humanities    21
        Unknown              16
        Education            10
        Health & Medicine     8
        Social Studies        8
        Other                 7
        Business              4
Name: count, dtype: int64
No description has been provided for this image
Gender  EnjoysBoardGames 
Man     Yes                   93
        No                     8
Woman   Yes                  106
        No                     2
        Prefer not to say      1
Name: count, dtype: int64
No description has been provided for this image
Gender  BoardGamesOwned  
Man     2 to 5               29
        1 or 2               24
        5 to 10              24
        0                    10
        10 to 20              7
        More than 20          7
        Prefer not to say     0
Woman   5 to 10              34
        2 to 5               31
        1 or 2               27
        10 to 20              9
        0                     4
        More than 20          4
        Prefer not to say     0
Name: count, dtype: int64
No description has been provided for this image
Gender  FrequencyOfPlay      
Man     Several times a month    24
        Every few months         18
        Rarely/Seldom            17
        Weekly                   14
        Monthly                  13
        Several times a week     10
        Never                     3
        Daily                     2
        Prefer not to say         0
Woman   Every few months         28
        Several times a month    27
        Monthly                  21
        Weekly                   12
        Rarely/Seldom             9
        Several times a week      7
        Daily                     4
        Never                     1
        Prefer not to say         0
Name: count, dtype: int64
No description has been provided for this image
Gender  Style             
Man     Situation-Specific    38
        Casual                31
        Strategic             20
        Party/Social           6
        Not Interested         3
        Simple                 2
        Other                  1
Woman   Situation-Specific    43
        Casual                41
        Strategic             15
        Party/Social           6
        Simple                 4
Name: count, dtype: int64
No description has been provided for this image
Gender  ChessRating
Man     7              25
        8              17
        6              12
        4              11
        3               8
        5               8
        1               8
        9               4
        10              4
        2               4
        Unknown         0
Woman   1              28
        5              14
        2              14
        7              12
        4              10
        6               9
        8               8
        3               7
        Unknown         3
        10              3
        9               1
Name: count, dtype: int64
No description has been provided for this image
In [35]:
bools_against_category(genderForAnalysis_df,race_bool_features,"Gender")
bools_against_category(genderForAnalysis_df,element_bool_features,"Gender")
bools_against_category(genderForAnalysis_df,genre_bool_features,"Gender")
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image

All Single Select Against ChessRating Box Plot

There does appear to be a substantial difference in ratings between Men and Women within in the data.

In [36]:
genderForAnalysis_df["Gender"].value_counts()
Out[36]:
Gender
Woman    109
Man      101
Name: count, dtype: int64
In [37]:
modified_df = cleaned_df.copy()
modified_df = filter_threshold(modified_df,"Gender",30)
modified_df = filter_threshold(modified_df,"Age",10)
modified_df = filter_threshold(modified_df,"EmploymentStatus",30)
modified_df["Religiosity"] = modified_df["Religiosity"].value_counts()
modified_df = filter_threshold(modified_df,"AOSCat",30)


features = single_select_features.copy()
features.remove("WWUStatus")
for feature in features:
    print(modified_df[feature].value_counts())
    plt.figure(figsize=(20, 10))
    sns.boxenplot(x=modified_df[feature],y=modified_df["ChessRating"])
    plt.ylim(reversed(plt.ylim()))
    plt.xticks(rotation=90)
    plt.show()
Gender
Woman    109
Man      101
Other     34
Name: count, dtype: int64
No description has been provided for this image
Age
18       69
19       58
20       41
21       28
Other    23
22       15
23       10
Name: count, dtype: int64
No description has been provided for this image
EmploymentStatus
Unemployed            124
Employed part-time    107
Other                  13
Name: count, dtype: int64
No description has been provided for this image
Vision
None                 132
Glasses               76
Both                  25
Contacts              10
Prefer not to say      1
Name: count, dtype: int64
No description has been provided for this image
Series([], Name: count, dtype: int64)
No description has been provided for this image
AOSCat
STEM                 95
Other                65
Arts & Humanities    47
Unknown              37
Name: count, dtype: int64
No description has been provided for this image
EnjoysBoardGames
Yes                  231
No                    12
Prefer not to say      1
Name: count, dtype: int64
No description has been provided for this image
BoardGamesOwned
2 to 5               73
5 to 10              60
1 or 2               59
10 to 20             20
0                    19
More than 20         13
Prefer not to say     0
Name: count, dtype: int64
No description has been provided for this image
FrequencyOfPlay
Several times a month    62
Every few months         54
Monthly                  37
Weekly                   30
Rarely/Seldom            30
Several times a week     19
Daily                     7
Never                     5
Prefer not to say         0
Name: count, dtype: int64
No description has been provided for this image
Style
Situation-Specific    91
Casual                79
Strategic             42
Party/Social          19
Simple                 8
Not Interested         4
Other                  1
Name: count, dtype: int64
No description has been provided for this image
ChessRating
1          43
7          43
4          26
8          26
5          25
2          23
6          23
3          18
10          9
9           5
Unknown     3
Name: count, dtype: int64
No description has been provided for this image
In [38]:
# Concatenate the boolean columns with the "ChessRating" column
df_bool = cleaned_df[element_bool_features + ['ChessRating']]

# Reshape your DataFrame so that each boolean feature and its corresponding "ChessRating" are in a single row
df_melt = df_bool.melt(id_vars='ChessRating', var_name='Feature', value_name='Value')

# Select only the rows where the category is marked as True
df_melt = df_melt[df_melt['Value'] == 1]

# Now you can plot everything on the same plot
plt.figure(figsize=(20, 10))
sns.boxenplot(x='Feature', y='ChessRating', data=df_melt)
plt.ylim(reversed(plt.ylim()))
plt.xticks(rotation=90)
plt.title('Distribution of ChessRating for Each Feature')
plt.show()
No description has been provided for this image
In [39]:
# Concatenate the boolean columns with the "ChessRating" column
df_bool = cleaned_df[genre_bool_features + ['ChessRating']]

# Reshape your DataFrame so that each boolean feature and its corresponding "ChessRating" are in a single row
df_melt = df_bool.melt(id_vars='ChessRating', var_name='Feature', value_name='Value')

# Select only the rows where the category is marked as True
df_melt = df_melt[df_melt['Value'] == 1]

# Now you can plot everything on the same plot
plt.figure(figsize=(20, 10))
sns.boxenplot(x='Feature', y='ChessRating', data=df_melt)
plt.ylim(reversed(plt.ylim()))
plt.xticks(rotation=90)
plt.title('Distribution of ChessRating for Each Feature')
plt.show()
No description has been provided for this image
In [40]:
modified_df = cleaned_df.copy()
modified_df = filter_threshold(modified_df,"Gender",30)
modified_df = filter_threshold(modified_df,"Age",10)
modified_df = filter_threshold(modified_df,"EmploymentStatus",30)
modified_df["Religiosity"] = modified_df["Religiosity"].map(religious_bucketing)
modified_df = filter_threshold(modified_df,"AOSCat",30)

features = single_select_features.copy()
features.remove("WWUStatus")
for feature in features:
    print(modified_df[feature].value_counts())
    bools_against_category(modified_df,element_bool_features,feature)
Gender
Woman    109
Man      101
Other     34
Name: count, dtype: int64
No description has been provided for this image
Age
18       69
19       58
20       41
21       28
Other    23
22       15
23       10
Name: count, dtype: int64
No description has been provided for this image
EmploymentStatus
Unemployed            124
Employed part-time    107
Other                  13
Name: count, dtype: int64
No description has been provided for this image
Vision
None                 132
Glasses               76
Both                  25
Contacts              10
Prefer not to say      1
Name: count, dtype: int64
No description has been provided for this image
Religiosity
Not Religious    197
Religious         46
Name: count, dtype: int64
No description has been provided for this image
AOSCat
STEM                 95
Other                65
Arts & Humanities    47
Unknown              37
Name: count, dtype: int64
No description has been provided for this image
EnjoysBoardGames
Yes                  231
No                    12
Prefer not to say      1
Name: count, dtype: int64
No description has been provided for this image
BoardGamesOwned
2 to 5               73
5 to 10              60
1 or 2               59
10 to 20             20
0                    19
More than 20         13
Prefer not to say     0
Name: count, dtype: int64
No description has been provided for this image
FrequencyOfPlay
Several times a month    62
Every few months         54
Monthly                  37
Weekly                   30
Rarely/Seldom            30
Several times a week     19
Daily                     7
Never                     5
Prefer not to say         0
Name: count, dtype: int64
No description has been provided for this image
Style
Situation-Specific    91
Casual                79
Strategic             42
Party/Social          19
Simple                 8
Not Interested         4
Other                  1
Name: count, dtype: int64
No description has been provided for this image
ChessRating
1          43
7          43
4          26
8          26
5          25
2          23
6          23
3          18
10          9
9           5
Unknown     3
Name: count, dtype: int64
No description has been provided for this image
In [41]:
modified_df = cleaned_df.copy()
modified_df = filter_threshold(modified_df,"Gender",30)
modified_df = filter_threshold(modified_df,"Age",10)
modified_df = filter_threshold(modified_df,"EmploymentStatus",30)
modified_df["Religiosity"] = modified_df["Religiosity"].map(religious_bucketing)
modified_df = filter_threshold(modified_df,"AOSCat",30)

features = single_select_features.copy()
features.remove("WWUStatus")
for feature in features:
    print(modified_df[feature].value_counts())
    bools_against_category(modified_df,genre_bool_features,feature)
Gender
Woman    109
Man      101
Other     34
Name: count, dtype: int64
No description has been provided for this image
Age
18       69
19       58
20       41
21       28
Other    23
22       15
23       10
Name: count, dtype: int64
No description has been provided for this image
EmploymentStatus
Unemployed            124
Employed part-time    107
Other                  13
Name: count, dtype: int64
No description has been provided for this image
Vision
None                 132
Glasses               76
Both                  25
Contacts              10
Prefer not to say      1
Name: count, dtype: int64
No description has been provided for this image
Religiosity
Not Religious    197
Religious         46
Name: count, dtype: int64
No description has been provided for this image
AOSCat
STEM                 95
Other                65
Arts & Humanities    47
Unknown              37
Name: count, dtype: int64
No description has been provided for this image
EnjoysBoardGames
Yes                  231
No                    12
Prefer not to say      1
Name: count, dtype: int64
No description has been provided for this image
BoardGamesOwned
2 to 5               73
5 to 10              60
1 or 2               59
10 to 20             20
0                    19
More than 20         13
Prefer not to say     0
Name: count, dtype: int64
No description has been provided for this image
FrequencyOfPlay
Several times a month    62
Every few months         54
Monthly                  37
Weekly                   30
Rarely/Seldom            30
Several times a week     19
Daily                     7
Never                     5
Prefer not to say         0
Name: count, dtype: int64
No description has been provided for this image
Style
Situation-Specific    91
Casual                79
Strategic             42
Party/Social          19
Simple                 8
Not Interested         4
Other                  1
Name: count, dtype: int64
No description has been provided for this image
ChessRating
1          43
7          43
4          26
8          26
5          25
2          23
6          23
3          18
10          9
9           5
Unknown     3
Name: count, dtype: int64
No description has been provided for this image
In [42]:
selected_rows = cleaned_df.copy()
selected_rows["FrequencyOfPlay"] = cleaned_df["FrequencyOfPlay"].apply(lambda x: "At Least Weekly" if x in ["Daily","Several times a week","Weekly"] else x)

selected_rows["FrequencyOfPlay"] = pd.Categorical(selected_rows["FrequencyOfPlay"],
    categories=["At Least Weekly","Several times a month","Monthly","Every few months","Rarely/Seldom","Never","Prefer not to say"],ordered=True)

bools_against_category(selected_rows,element_bool_features,"BoardGamesOwned")
bools_against_category(selected_rows,genre_bool_features,"BoardGamesOwned")
No description has been provided for this image
No description has been provided for this image
In [43]:
modified_df = cleaned_df.copy()
modified_df = filter_threshold(modified_df,"Style",20)
modified_df = modified_df[modified_df["Style"] != "Other"]


print(modified_df["Style"].value_counts())
bools_against_category(modified_df,element_bool_features,"Style")
bools_against_category(modified_df,genre_bool_features,"Style")
Style
Situation-Specific    91
Casual                79
Strategic             42
Name: count, dtype: int64
No description has been provided for this image
No description has been provided for this image
In [44]:
modified_df = cleaned_df.copy()
modified_df = filter_threshold(modified_df,"AOSCat",20)

print(modified_df["AOSCat"].value_counts())

features = single_select_features.copy()
features.remove("WWUStatus")
categories_against_category(modified_df,features,"AOSCat")
AOSCat
STEM                 95
Other                65
Arts & Humanities    47
Unknown              37
Name: count, dtype: int64
AOSCat             Gender                   
Arts & Humanities  Woman                        21
                   Man                          16
                   Non-binary                    6
                   Genderfluid                   2
                   Gender queer                  1
                   Gender-fluid                  1
Other              Woman                        37
                   Man                          23
                   Non-binary                    4
                   Genderqueer                   1
STEM               Man                          48
                   Woman                        35
                   Non-binary                    8
                   unsure                        1
                   girl thing                    1
                   Prefer not to say             1
                   Genderfluid                   1
Unknown            Woman                        16
                   Man                          14
                   Non-binary                    5
                   Prefer not to say             1
                   Wouldn't you like to know     1
Name: count, dtype: int64
No description has been provided for this image
AOSCat             Age
Arts & Humanities   18    18
                    20    11
                    19     6
                    21     6
                    23     4
                    22     1
                    24     1
Other               18    16
                    19    15
                    20    11
                    22    10
                    21     9
                    23     2
                   -1      1
                    24     1
STEM                18    23
                    19    22
                    20    16
                    21    12
                   -1      4
                    22     4
                    23     4
                    26     4
                    24     2
                    25     2
                    28     1
                    29     1
Unknown             19    15
                    18    12
                   -1      3
                    20     3
                    24     2
                    21     1
                    36     1
Name: count, dtype: int64
No description has been provided for this image
AOSCat             EmploymentStatus                 
Arts & Humanities  Unemployed                           24
                   Employed part-time                   19
                   Employed full-time                    2
                   Self-employed                         1
                   Prefer not to say                     1
Other              Unemployed                           36
                   Employed part-time                   28
                   Seasonally employed                   1
STEM               Employed part-time                   51
                   Unemployed                           39
                   Employed full-time                    1
                   Prefer not to say                     1
                   Seasonal worker during the summer     1
                   Self-employed                         1
                   One day a week babysitting            1
Unknown            Unemployed                           25
                   Employed part-time                    9
                   Employed full-time                    1
                   Seeking employment                    1
                   Self-employed                         1
Name: count, dtype: int64
No description has been provided for this image
AOSCat             Vision           
Arts & Humanities  None                 23
                   Glasses              19
                   Both                  5
Other              None                 31
                   Glasses              20
                   Both                  9
                   Contacts              5
STEM               None                 53
                   Glasses              27
                   Both                  9
                   Contacts              5
                   Prefer not to say     1
Unknown            None                 25
                   Glasses              10
                   Both                  2
Name: count, dtype: int64
No description has been provided for this image
AOSCat             Religiosity                                                
Arts & Humanities  No specific belief                                             11
                   Atheism                                                        10
                   Agnosticism                                                     7
                   Christianity                                                    7
                   Spiritual, not affiliated with a specific religion              5
                   Judaism                                                         2
                   Pagan                                                           1
                   Science                                                         1
                   Toaism                                                          1
                   pagan                                                           1
                   Paganism                                                        1
Other              Agnosticism                                                    16
                   Christianity                                                   13
                   Spiritual, not affiliated with a specific religion             12
                   No specific belief                                             11
                   Atheism                                                        11
                   Lutheran                                                        1
                   Hinduism                                                        1
STEM               Atheism                                                        28
                   No specific belief                                             23
                   Agnosticism                                                    17
                   Spiritual, not affiliated with a specific religion             11
                   Christianity                                                   10
                   Unitarian                                                       1
                   being with oneself in connection to everything                  1
                   Prefer not to say                                               1
                   Judaism                                                         1
                   Islam                                                           1
                   Buddhism                                                        1
Unknown            No specific belief                                             10
                   Atheism                                                        10
                   Agnosticism                                                     9
                   Spiritual, not affiliated with a specific religion              3
                   Christianity                                                    2
                   Prefer not to say                                               1
                   Judaism                                                         1
                   I believe a god exists but don’t follow any religious texts     1
Name: count, dtype: int64
No description has been provided for this image
AOSCat
Arts & Humanities    47
Other                65
STEM                 95
Unknown              37
Name: count, dtype: int64
No description has been provided for this image
AOSCat             EnjoysBoardGames 
Arts & Humanities  Yes                  46
                   No                    1
Other              Yes                  64
                   No                    1
STEM               Yes                  87
                   No                    7
                   Prefer not to say     1
Unknown            Yes                  34
                   No                    3
Name: count, dtype: int64
No description has been provided for this image
AOSCat             BoardGamesOwned  
Arts & Humanities  1 or 2               13
                   5 to 10              13
                   2 to 5                9
                   0                     5
                   10 to 20              4
                   More than 20          3
                   Prefer not to say     0
Other              2 to 5               21
                   1 or 2               18
                   5 to 10              15
                   10 to 20              5
                   0                     4
                   More than 20          2
                   Prefer not to say     0
STEM               2 to 5               33
                   5 to 10              25
                   1 or 2               19
                   10 to 20              8
                   0                     7
                   More than 20          3
                   Prefer not to say     0
Unknown            2 to 5               10
                   1 or 2                9
                   5 to 10               7
                   More than 20          5
                   10 to 20              3
                   0                     3
                   Prefer not to say     0
Name: count, dtype: int64
No description has been provided for this image
AOSCat             FrequencyOfPlay      
Arts & Humanities  Several times a month    15
                   Every few months         14
                   Rarely/Seldom             5
                   Monthly                   4
                   Several times a week      3
                   Never                     3
                   Weekly                    2
                   Daily                     1
                   Prefer not to say         0
Other              Several times a month    19
                   Every few months         17
                   Weekly                   10
                   Rarely/Seldom             8
                   Monthly                   8
                   Several times a week      2
                   Daily                     1
                   Prefer not to say         0
                   Never                     0
STEM               Several times a month    21
                   Every few months         18
                   Monthly                  18
                   Rarely/Seldom            14
                   Several times a week     10
                   Weekly                    9
                   Daily                     4
                   Never                     1
                   Prefer not to say         0
Unknown            Weekly                    9
                   Monthly                   7
                   Several times a month     7
                   Every few months          5
                   Several times a week      4
                   Rarely/Seldom             3
                   Never                     1
                   Daily                     1
                   Prefer not to say         0
Name: count, dtype: int64
No description has been provided for this image
AOSCat             Style             
Arts & Humanities  Situation-Specific    18
                   Casual                16
                   Party/Social           7
                   Strategic              4
                   Not Interested         1
                   Simple                 1
Other              Situation-Specific    30
                   Casual                20
                   Strategic              7
                   Party/Social           5
                   Simple                 3
STEM               Casual                31
                   Situation-Specific    29
                   Strategic             23
                   Party/Social           7
                   Not Interested         2
                   Simple                 2
                   Other                  1
Unknown            Situation-Specific    14
                   Casual                12
                   Strategic              8
                   Simple                 2
                   Not Interested         1
Name: count, dtype: int64
No description has been provided for this image
AOSCat             ChessRating
Arts & Humanities  1              11
                   7               8
                   5               7
                   8               6
                   6               5
                   2               4
                   Unknown         2
                   4               2
                   3               2
                   10              0
                   9               0
Other              1              12
                   7              12
                   4              10
                   5               8
                   2               8
                   8               5
                   9               3
                   3               3
                   6               2
                   10              1
                   Unknown         1
STEM               1              17
                   7              16
                   6              15
                   8              11
                   4              10
                   3               8
                   10              6
                   2               6
                   5               5
                   9               1
                   Unknown         0
Unknown            7               7
                   2               5
                   3               5
                   5               5
                   4               4
                   8               4
                   1               3
                   10              2
                   6               1
                   9               1
                   Unknown         0
Name: count, dtype: int64
No description has been provided for this image
In [45]:
bools_against_category(modified_df,element_bool_features,"AOSCat")
bools_against_category(modified_df,genre_bool_features,"AOSCat")
No description has been provided for this image
No description has been provided for this image

Machine Learning Clustering for Exploratory Analysis¶

In [46]:
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder

from sklearn.calibration import LabelEncoder
from sklearn.discriminant_analysis import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import confusion_matrix, accuracy_score, make_scorer, r2_score
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.dummy import DummyClassifier


from sklearn import metrics
from sklearn.cluster import KMeans

data = cleaned_df.copy()
X = data.drop(columns=["EnjoyedBoardGames","AOS"])

onehot_and_ordinal_transform = ColumnTransformer(
    transformers=[
        ("onehot", OneHotEncoder(), ["WWUStatus","EmploymentStatus","Vision","Religiosity","EnjoysBoardGames","Gender","Style","AOSCat"]),
        ("ordinal", make_pipeline(OrdinalEncoder(categories=[["Prefer not to say", "0", "1 or 2", "2 to 5", "5 to 10", "10 to 20", "More than 20"],
                                               ["Daily", "Several times a week", "Weekly", "Several times a month", "Monthly", "Every few months", "Rarely/Seldom", "Never", "Prefer not to say"],
                                               ["Unknown", "1", "2", "3", "4", "5", "6", "7", "8", "9", "10"]]),StandardScaler()),
                                                
                                               ["BoardGamesOwned",
                                                "FrequencyOfPlay",
                                                "ChessRating"]),
        ("scale numeric types", StandardScaler(), ["Age"])                      
    ], remainder="passthrough"
    )


X_transformed = onehot_and_ordinal_transform.fit_transform(X)



for i in range(2,10):
    k_means_model = KMeans(n_clusters=i, random_state=5, n_init=10)
    clusters = k_means_model.fit_predict(X_transformed)
    score = metrics.silhouette_score(X_transformed, clusters)
    print(f'Fitting for {i} clusters')
    print(f'score: {score}')
    print()
Fitting for 2 clusters
score: 0.11863466543320195

Fitting for 3 clusters
score: 0.05927103971485946

Fitting for 4 clusters
score: 0.053179399224910384

Fitting for 5 clusters
score: 0.050326176098971546

Fitting for 6 clusters
score: 0.053239186913892206

Fitting for 7 clusters
score: 0.0531029232311401

Fitting for 8 clusters
score: 0.049338481370941906

Fitting for 9 clusters
score: 0.0474473146034939

In [47]:
k_means_model = KMeans(n_clusters=2, random_state=5, n_init=10)
clusters = k_means_model.fit_predict(X_transformed)
score = metrics.silhouette_score(X_transformed, clusters)
print(f'Fitting for {2} clusters')
print(f'score: {score}')
print()
Fitting for 2 clusters
score: 0.11863466543320195

In [48]:
X["Cluster"] = k_means_model.fit_predict(X_transformed)
X
Out[48]:
WWUStatus Gender Age EmploymentStatus Vision Religiosity EnjoysBoardGames BoardGamesOwned FrequencyOfPlay Style ... TransportationIsEnjoyedGenre TravelIsEnjoyedGenre TriviaIsEnjoyedGenre WarIsEnjoyedGenre WordIsEnjoyedGenre WorkerPlacementIsEnjoyedGenre WorldWarIIIsEnjoyedGenre ZombiesIsEnjoyedGenre AOSCat Cluster
0 WWU Student Man 22 Employed part-time None No specific belief No 0 Never Not Interested ... 0 0 0 0 0 0 0 0 STEM 0
1 WWU Student Man 20 Employed part-time Contacts Atheism Yes 2 to 5 Rarely/Seldom Party/Social ... 0 0 0 0 0 0 0 0 STEM 0
2 WWU Student Non-binary 18 Unemployed None Atheism Yes 10 to 20 Several times a week Strategic ... 0 0 0 0 0 0 0 0 STEM 0
3 WWU Student Man 21 Employed part-time None Atheism Yes 10 to 20 Several times a month Strategic ... 0 0 0 1 0 0 1 0 STEM 1
4 WWU Student Man 22 Employed part-time None Christianity Yes More than 20 Weekly Situation-Specific ... 0 0 1 1 1 1 1 1 STEM 1
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
239 WWU Student Man 19 Employed part-time None No specific belief No 2 to 5 Rarely/Seldom Situation-Specific ... 0 0 0 0 0 0 0 0 STEM 0
240 WWU Student Non-binary 19 Unemployed Glasses Agnosticism Yes 1 or 2 Rarely/Seldom Casual ... 0 0 0 0 0 0 0 0 Social Studies 0
241 WWU Student Non-binary 19 Employed part-time None Agnosticism Yes 2 to 5 Several times a month Situation-Specific ... 0 1 1 0 1 0 0 0 STEM 1
242 WWU Student Man 18 Unemployed Glasses Agnosticism Yes 5 to 10 Several times a week Casual ... 1 0 1 0 1 1 0 0 Arts & Humanities 1
243 WWU Student Man 19 Unemployed Glasses Agnosticism Yes 2 to 5 Rarely/Seldom Party/Social ... 0 0 0 0 0 0 0 0 STEM 0

244 rows × 75 columns

In [49]:
X["Cluster"].value_counts()
Out[49]:
Cluster
0    187
1     57
Name: count, dtype: int64
In [50]:
X["Cluster"].value_counts(normalize=True)
Out[50]:
Cluster
0    0.766393
1    0.233607
Name: proportion, dtype: float64
In [51]:
categories_against_category(X,single_select_features,"Cluster")
Cluster  WWUStatus         
0        WWU Student           184
         WWU Faculty Member      3
1        WWU Student            56
         Neither                 1
Name: count, dtype: int64
No description has been provided for this image
Cluster  Gender                   
0        Woman                        82
         Man                          78
         Non-binary                   17
         Genderfluid                   3
         Prefer not to say             2
         Gender-fluid                  1
         Genderqueer                   1
         Wouldn't you like to know     1
         girl thing                    1
         unsure                        1
1        Woman                        27
         Man                          23
         Non-binary                    6
         Gender queer                  1
Name: count, dtype: int64
No description has been provided for this image
Cluster  Age
0         18    57
          19    38
          20    30
          21    24
          22    11
          23     9
         -1      7
          24     4
          26     3
          25     2
          36     1
          28     1
1         19    20
          18    12
          20    11
          21     4
          22     4
          24     2
          26     1
          23     1
         -1      1
          29     1
Name: count, dtype: int64
No description has been provided for this image
Cluster  EmploymentStatus                 
0        Unemployed                           92
         Employed part-time                   85
         Employed full-time                    3
         Self-employed                         3
         One day a week babysitting            1
         Prefer not to say                     1
         Seasonal worker during the summer     1
         Seeking employment                    1
1        Unemployed                           32
         Employed part-time                   22
         Employed full-time                    1
         Prefer not to say                     1
         Seasonally employed                   1
Name: count, dtype: int64
No description has been provided for this image
Cluster  Vision           
0        None                 108
         Glasses               53
         Both                  16
         Contacts              10
1        None                  24
         Glasses               23
         Both                   9
         Prefer not to say      1
Name: count, dtype: int64
No description has been provided for this image
Cluster  Religiosity                                                
0        Atheism                                                        46
         No specific belief                                             40
         Agnosticism                                                    37
         Spiritual, not affiliated with a specific religion             27
         Christianity                                                   22
         Judaism                                                         3
         Prefer not to say                                               2
         pagan                                                           1
         Science                                                         1
         being with oneself in connection to everything                  1
         Toaism                                                          1
         Paganism                                                        1
         Pagan                                                           1
         Lutheran                                                        1
         Islam                                                           1
         I believe a god exists but don’t follow any religious texts     1
         Hinduism                                                        1
1        No specific belief                                             15
         Atheism                                                        13
         Agnosticism                                                    12
         Christianity                                                   10
         Spiritual, not affiliated with a specific religion              4
         Judaism                                                         1
         Buddhism                                                        1
         Unitarian                                                       1
Name: count, dtype: int64
No description has been provided for this image
Cluster  AOSCat           
0        STEM                 74
         Arts & Humanities    33
         Unknown              26
         Education            16
         Social Studies       14
         Business              8
         Health & Medicine     8
         Other                 8
1        STEM                 21
         Arts & Humanities    14
         Unknown              11
         Social Studies        5
         Business              2
         Health & Medicine     2
         Other                 2
Name: count, dtype: int64
No description has been provided for this image
Cluster  EnjoysBoardGames 
0        Yes                  175
         No                    11
         Prefer not to say      1
1        Yes                   56
         No                     1
Name: count, dtype: int64
No description has been provided for this image
Cluster  BoardGamesOwned  
0        2 to 5               62
         1 or 2               54
         5 to 10              40
         0                    19
         10 to 20              8
         More than 20          4
         Prefer not to say     0
1        5 to 10              20
         10 to 20             12
         2 to 5               11
         More than 20          9
         1 or 2                5
         Prefer not to say     0
         0                     0
Name: count, dtype: int64
No description has been provided for this image
Cluster  FrequencyOfPlay      
0        Every few months         44
         Several times a month    43
         Monthly                  29
         Rarely/Seldom            28
         Weekly                   23
         Several times a week     10
         Daily                     5
         Never                     5
         Prefer not to say         0
1        Several times a month    19
         Every few months         10
         Several times a week      9
         Monthly                   8
         Weekly                    7
         Rarely/Seldom             2
         Daily                     2
         Never                     0
         Prefer not to say         0
Name: count, dtype: int64
No description has been provided for this image
Cluster  Style             
0        Casual                68
         Situation-Specific    54
         Strategic             33
         Party/Social          19
         Simple                 8
         Not Interested         4
         Other                  1
1        Situation-Specific    37
         Casual                11
         Strategic              9
Name: count, dtype: int64
No description has been provided for this image
Cluster  ChessRating
0        1              37
         7              36
         2              21
         4              18
         8              18
         5              16
         6              16
         3              15
         10              5
         9               4
         Unknown         1
1        5               9
         4               8
         8               8
         6               7
         7               7
         1               6
         10              4
         3               3
         Unknown         2
         2               2
         9               1
Name: count, dtype: int64
No description has been provided for this image
In [52]:
bools_against_category(X,race_bool_features,"Cluster")
bools_against_category(X,element_bool_features,"Cluster")
bools_against_category(X,genre_bool_features,"Cluster")
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
In [53]:
from sklearn.cluster import SpectralClustering

data = cleaned_df.copy()
X = data.drop(columns=["EnjoyedBoardGames","AOS"])

onehot_and_ordinal_transform = ColumnTransformer(
    transformers=[
        ("onehot", OneHotEncoder(), ["WWUStatus","EmploymentStatus","Vision","Religiosity","EnjoysBoardGames","Gender","Style","AOSCat"]),
        ("ordinal", make_pipeline(OrdinalEncoder(categories=[["Prefer not to say", "0", "1 or 2", "2 to 5", "5 to 10", "10 to 20", "More than 20"],
                                               ["Daily", "Several times a week", "Weekly", "Several times a month", "Monthly", "Every few months", "Rarely/Seldom", "Never", "Prefer not to say"],
                                               ["Unknown", "1", "2", "3", "4", "5", "6", "7", "8", "9", "10"]]),StandardScaler()),
                                                
                                               ["BoardGamesOwned",
                                                "FrequencyOfPlay",
                                                "ChessRating"]),
        ("scale numeric types", StandardScaler(), ["Age"])                      
    ], remainder="passthrough"
    )


X_transformed = onehot_and_ordinal_transform.fit_transform(X)



for i in range(2,10):
    k_means_model = SpectralClustering(n_clusters=i, random_state=5, n_init=10)
    clusters = k_means_model.fit_predict(X_transformed)
    score = metrics.silhouette_score(X_transformed, clusters)
    print(f'Fitting for {i} clusters')
    print(f'score: {score}')
    print()
Fitting for 2 clusters
score: 0.22951660547844696

Fitting for 3 clusters
score: 0.14844192732350808

Fitting for 4 clusters
score: 0.11973800538607844

Fitting for 5 clusters
score: 0.1109678779048612

Fitting for 6 clusters
score: 0.020832703490430395

Fitting for 7 clusters
score: 0.0134036514841993

Fitting for 8 clusters
score: 0.029962589396250204

Fitting for 9 clusters
score: 0.03174289031861758

Machine Learning Predictive Model¶

First Attempt at Building a Machine Learning Model, Predict Gender from Board Game Preferences¶

In [54]:
# load dataset
data = cleaned_df.copy()
data = data[["Gender","BoardGamesOwned","FrequencyOfPlay","Style","ChessRating",
     
     "Party/Low-StakesIsPreferredElement","SocialDeduction/HiddenRoleIsPreferredElement","TriviaIsPreferredElement",

     "AnimalsIsEnjoyedGenre","CardIsEnjoyedGenre","FantasyIsEnjoyedGenre","HorrorIsEnjoyedGenre","LuckIsEnjoyedGenre",
     "PuzzleIsEnjoyedGenre","Role-PlayingIsEnjoyedGenre","RollandMoveIsEnjoyedGenre","ScienceFictionIsEnjoyedGenre",
     "SocialDeduction/HiddenRoleIsEnjoyedGenre","SportsIsEnjoyedGenre","TriviaIsEnjoyedGenre","WarIsEnjoyedGenre",
     "WorldWarIIIsEnjoyedGenre","ZombiesIsEnjoyedGenre"]]

# data = data.drop(columns=["WWUStatus","EmploymentStatus","Vision","Religiosity","AOS","EnjoysBoardGames",
#                           "EnjoyedBoardGames"])



data = filter_threshold(data,"Gender",20)
data = data[data["Gender"] != "Other"]
# data = filter_threshold(data,"AOSCat",15)
data = filter_threshold(data,"Style",15)
In [55]:
X, y = data.drop(columns=["Gender"]), data["Gender"]

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.5, test_size=0.5, stratify=y, random_state=40)


onehot_and_ordinal_transform = ColumnTransformer(
    transformers=[
        ("onehot", OneHotEncoder(), ["Style"]),
        ("ordinal", make_pipeline(OrdinalEncoder(categories=[["Prefer not to say", "0", "1 or 2", "2 to 5", "5 to 10", "10 to 20", "More than 20"],
                                               ["Daily", "Several times a week", "Weekly", "Several times a month", "Monthly", "Every few months", "Rarely/Seldom", "Never", "Prefer not to say"],
                                               ["Unknown", "1", "2", "3", "4", "5", "6", "7", "8", "9", "10"]]),StandardScaler()),
                                                
                                               ["BoardGamesOwned",
                                                "FrequencyOfPlay",
                                                "ChessRating"])                         
    ], remainder="passthrough"
)


logistic_pipe = make_pipeline(
    onehot_and_ordinal_transform,
    LogisticRegression(max_iter=10000)
)

k_neighbors_classifier_pipe = make_pipeline(
    onehot_and_ordinal_transform,
    KNeighborsClassifier()
)

decision_tree_classifier_pipe = make_pipeline(
    onehot_and_ordinal_transform,
    DecisionTreeClassifier()
)

random_forest_classifier_pipe = make_pipeline(
    onehot_and_ordinal_transform,
    RandomForestClassifier(random_state=40)
)

dummy_classifier = DummyClassifier(strategy='most_frequent')


current_pipe = logistic_pipe

print(f"Model Cross Val Scores: {cross_val_score(current_pipe,X_train,y_train,cv=4,scoring=make_scorer(accuracy_score))}")
print(f"Dummy Cross Val Scores: {cross_val_score(dummy_classifier,X_train,y_train,cv=4,scoring=make_scorer(accuracy_score))}")
Model Cross Val Scores: [0.51724138 0.62068966 0.72413793 0.72413793]
Dummy Cross Val Scores: [0.48275862 0.48275862 0.44827586 0.44827586]
In [56]:
# Should be used when certain model performs well

current_pipe.fit(X_train,y_train)
conf_matrix = confusion_matrix(y_test, current_pipe.predict(X_test),labels=["Man","Woman","Other"])

print(conf_matrix)

print(f"Model Accuracy Score: {accuracy_score(current_pipe.predict(X_test),y_test)}")



dummy_classifier.fit(X_train, y_train)
y_pred = dummy_classifier.predict(X_test)

print(f"Dummy Accuracy Score: {accuracy_score(y_test, y_pred)}")
[[31 19  0]
 [16 37  0]
 [ 0  0  0]]
Model Accuracy Score: 0.5811965811965812
Dummy Accuracy Score: 0.4700854700854701

Second Attempt at Machine Learning Model, Predicting Different Board Game Stats Based on Demographics¶

In [57]:
# load dataset
data = cleaned_df.copy()
data = data[['Gender', 'Age', 
            'AOSCat',
       
            "BoardGamesOwned","FrequencyOfPlay","Style","ChessRating",]]

# data = data.drop(columns=["WWUStatus","EmploymentStatus","Vision","Religiosity","AOS","EnjoysBoardGames",
#                           "EnjoyedBoardGames"])



data = filter_threshold(data,"Gender",20)
# data = data[data["Gender"] != "Other"]
#filter_threshold(data,"Age",10)["Age"].value_counts()
# data = filter_threshold
data = filter_threshold(data,"AOSCat",15)
data = filter_threshold(data,"Style",15)

data["FrequencyOfPlay"] = data["FrequencyOfPlay"].apply(lambda x: "At Least Weekly" if x in ["Daily","Several times a week","Weekly"] else x)


data["FrequencyOfPlay"] = pd.Categorical(data["FrequencyOfPlay"],
    categories=["At Least Weekly","Several times a month","Monthly","Every few months","Rarely/Seldom","Never","Prefer not to say"],ordered=True)

data["ChessRating"] = data["ChessRating"].replace('Unknown', np.nan).astype(float)
In [58]:
data = data.dropna(subset=['ChessRating'])
In [59]:
len(data)
Out[59]:
241
In [60]:
X, y = data.drop(columns=["BoardGamesOwned","FrequencyOfPlay","Style","ChessRating"]), data["ChessRating"]

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.5, test_size=0.5, stratify=y, random_state=40)


onehot_and_ordinal_transform = ColumnTransformer(
    transformers=[
        ("onehot", OneHotEncoder(handle_unknown="ignore"), ["Gender","AOSCat", ]),
        # ("ordinal", make_pipeline(OrdinalEncoder(categories=[["Prefer not to say", "0", "1 or 2", "2 to 5", "5 to 10", "10 to 20", "More than 20"],
        #                                        ["Daily", "Several times a week", "Weekly", "Several times a month", "Monthly", "Every few months", "Rarely/Seldom", "Never", "Prefer not to say"],
        #                                        ["Unknown", "1", "2", "3", "4", "5", "6", "7", "8", "9", "10"]]),StandardScaler()),
                                                
        #                                        ["BoardGamesOwned",
        #                                         "FrequencyOfPlay",
        #                                         "ChessRating"])                         
    ], remainder="passthrough"
)


logistic_pipe = make_pipeline(
    onehot_and_ordinal_transform,
    LogisticRegression(max_iter=10000)
)


dummy_classifier = DummyClassifier(strategy='uniform')


current_pipe = logistic_pipe

print(f"Model Cross Val Scores: {cross_val_score(current_pipe,X_train,y_train,cv=4)}")
print(f"Dummy Cross Val Scores: {cross_val_score(dummy_classifier,X_train,y_train,cv=4)}")
/opt/hostedtoolcache/Python/3.8.18/x64/lib/python3.8/site-packages/sklearn/model_selection/_split.py:737: UserWarning: The least populated class in y has only 3 members, which is less than n_splits=4.
  warnings.warn(
Model Cross Val Scores: [0.13333333 0.13333333 0.13333333 0.23333333]
Dummy Cross Val Scores: [0.13333333 0.03333333 0.06666667 0.1       ]
/opt/hostedtoolcache/Python/3.8.18/x64/lib/python3.8/site-packages/sklearn/model_selection/_split.py:737: UserWarning: The least populated class in y has only 3 members, which is less than n_splits=4.
  warnings.warn(
In [61]:
current_pipe.fit(X_train,y_train)

current_pipe.score(X_train,y_train)
Out[61]:
0.30833333333333335
In [62]:
current_pipe.fit(X_train,y_train)
conf_matrix = confusion_matrix(y_test, current_pipe.predict(X_test))

print(conf_matrix)

print(f"Model Accuracy Score: {accuracy_score(current_pipe.predict(X_test),y_test)}")



dummy_classifier.fit(X_train, y_train)
y_pred = dummy_classifier.predict(X_test)

print(f"Dummy Accuracy Score: {accuracy_score(y_test, y_pred)}")
[[11  0  0  1  3  2  5  0  0  0]
 [ 5  0  3  1  1  0  0  1  0  0]
 [ 2  0  0  2  0  0  5  0  0  0]
 [ 6  0  0  1  0  2  3  1  0  0]
 [ 4  1  1  0  1  0  6  0  0  0]
 [ 2  0  1  0  2  0  7  0  0  0]
 [ 8  1  0  0  0  2  9  2  0  0]
 [ 4  0  0  1  1  2  5  0  0  0]
 [ 0  0  0  1  0  0  1  0  0  0]
 [ 0  0  0  0  0  1  3  0  0  0]]
Model Accuracy Score: 0.18181818181818182
Dummy Accuracy Score: 0.1322314049586777
In [63]:
single_select_features
Out[63]:
['WWUStatus',
 'Gender',
 'Age',
 'EmploymentStatus',
 'Vision',
 'Religiosity',
 'AOSCat',
 'EnjoysBoardGames',
 'BoardGamesOwned',
 'FrequencyOfPlay',
 'Style',
 'ChessRating']